{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入模块\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-2-d389e8bbeddb>:14: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts)\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') #隐藏滚动条, 应对些特殊页面\n",
    "\n",
    "driver = webdriver.Chrome( chrome_options = opts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 打开微信公众号平台"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 登录过程（需扫码）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//a[@class=\"login__type__container__select-type\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "payload =  {\"account\": \"897783536@qq.com\", \"password\": \"Saberaimer99\"}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 输入账号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').clear()\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 输入密码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').clear()\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 打开菜单栏"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'展开'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"mp_header\"]/div/div/a')\n",
    "element.click()\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "main_content"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 图文素材"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"js_mp_sidemenu\"]/div/div/ul/li[2]/ul/li[1]/ul/li[1]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 新的创作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div/div[1]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 写新图文"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div/div[2]/ul/li[1]')\n",
    "\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 切换窗口"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "windows = driver.window_handles\n",
    "driver.switch_to.window(windows[-1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击超链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//li[@id=\"js_editor_insertlink\"]') \n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 选择其他公众号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-btn weui-desktop-btn_default\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击搜索框"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/input')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 输入南方都市报"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/input').clear()\n",
    "element = driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/input').send_keys('南方都市报')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击搜索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/span/button[2]/div')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击南方都市报"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div[2]/ul/li[1]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 循环爬取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 设置翻页时间"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "##设置翻页时间\n",
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        跳转_input.clear()\n",
    "        跳转_input.send_keys(p)\n",
    "        跳转_a.click()\n",
    "        #时间加长一倍\n",
    "        time.sleep(2+2*random())\n",
    "\n",
    "        element = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        #print(main_content)\n",
    "        html_raw[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages[0:40])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>16</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>17</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>18</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>19</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>20</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>21</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>22</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>23</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>24</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>26</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>27</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>29</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>30</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>31</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>32</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>33</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>34</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>35</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>36</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>37</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>38</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>39</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>40</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "6   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "7   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "8   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "9   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "10  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "11  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "13  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "14  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "15  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "16  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "17  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "18  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "19  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "20  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "21  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "22  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "23  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "24  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "25  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "26  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "27  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "28  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "29  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "30  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "31  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "32  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "33  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "34  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "35  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "36  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "37  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "38  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "39  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "40  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from lxml.html import fromstring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store html_raw\n",
    "import pickle \n",
    "filehandler = open(\"html_raw\", 'wb') \n",
    "pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "39\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = df[~df.duplicated()]\n",
    "print (len(df_out))\n",
    "df[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[12]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[12,\n",
       " 41,\n",
       " 42,\n",
       " 43,\n",
       " 44,\n",
       " 45,\n",
       " 46,\n",
       " 47,\n",
       " 48,\n",
       " 49,\n",
       " 50,\n",
       " 51,\n",
       " 52,\n",
       " 53,\n",
       " 54,\n",
       " 55,\n",
       " 56,\n",
       " 57,\n",
       " 58,\n",
       " 59,\n",
       " 60,\n",
       " 61,\n",
       " 62,\n",
       " 63,\n",
       " 64,\n",
       " 65,\n",
       " 66,\n",
       " 67,\n",
       " 68,\n",
       " 69,\n",
       " 70,\n",
       " 71,\n",
       " 72,\n",
       " 73,\n",
       " 74,\n",
       " 75,\n",
       " 76,\n",
       " 77,\n",
       " 78,\n",
       " 79,\n",
       " 80,\n",
       " 81,\n",
       " 82,\n",
       " 83,\n",
       " 84,\n",
       " 85,\n",
       " 86,\n",
       " 87,\n",
       " 88,\n",
       " 89,\n",
       " 90,\n",
       " 91,\n",
       " 92,\n",
       " 93,\n",
       " 94,\n",
       " 95,\n",
       " 96,\n",
       " 97,\n",
       " 98,\n",
       " 99,\n",
       " 100,\n",
       " 101,\n",
       " 102,\n",
       " 103,\n",
       " 104,\n",
       " 105,\n",
       " 106,\n",
       " 107,\n",
       " 108,\n",
       " 109,\n",
       " 110,\n",
       " 111,\n",
       " 112,\n",
       " 113,\n",
       " 114,\n",
       " 115,\n",
       " 116,\n",
       " 117,\n",
       " 118,\n",
       " 119,\n",
       " 120,\n",
       " 121,\n",
       " 122,\n",
       " 123,\n",
       " 124,\n",
       " 125,\n",
       " 126,\n",
       " 127,\n",
       " 128,\n",
       " 129,\n",
       " 130,\n",
       " 131,\n",
       " 132,\n",
       " 133,\n",
       " 134,\n",
       " 135,\n",
       " 136,\n",
       " 137,\n",
       " 138,\n",
       " 139,\n",
       " 140,\n",
       " 141,\n",
       " 142,\n",
       " 143,\n",
       " 144,\n",
       " 145,\n",
       " 146,\n",
       " 147,\n",
       " 148,\n",
       " 149,\n",
       " 150,\n",
       " 151,\n",
       " 152,\n",
       " 153,\n",
       " 154,\n",
       " 155,\n",
       " 156,\n",
       " 157,\n",
       " 158,\n",
       " 159,\n",
       " 160,\n",
       " 161,\n",
       " 162,\n",
       " 163,\n",
       " 164,\n",
       " 165,\n",
       " 166,\n",
       " 167,\n",
       " 168,\n",
       " 169,\n",
       " 170,\n",
       " 171,\n",
       " 172,\n",
       " 173,\n",
       " 174,\n",
       " 175,\n",
       " 176,\n",
       " 177,\n",
       " 178,\n",
       " 179,\n",
       " 180,\n",
       " 181,\n",
       " 182,\n",
       " 183,\n",
       " 184,\n",
       " 185,\n",
       " 186,\n",
       " 187,\n",
       " 188,\n",
       " 189,\n",
       " 190,\n",
       " 191,\n",
       " 192,\n",
       " 193,\n",
       " 194,\n",
       " 195,\n",
       " 196,\n",
       " 197,\n",
       " 198,\n",
       " 199,\n",
       " 200,\n",
       " 201,\n",
       " 202,\n",
       " 203,\n",
       " 204,\n",
       " 205,\n",
       " 206,\n",
       " 207,\n",
       " 208,\n",
       " 209,\n",
       " 210,\n",
       " 211,\n",
       " 212,\n",
       " 213,\n",
       " 214,\n",
       " 215,\n",
       " 216,\n",
       " 217,\n",
       " 218,\n",
       " 219,\n",
       " 220,\n",
       " 221,\n",
       " 222,\n",
       " 223,\n",
       " 224,\n",
       " 225,\n",
       " 226,\n",
       " 227,\n",
       " 228,\n",
       " 229,\n",
       " 230,\n",
       " 231,\n",
       " 232,\n",
       " 233,\n",
       " 234,\n",
       " 235,\n",
       " 236,\n",
       " 237,\n",
       " 238,\n",
       " 239,\n",
       " 240,\n",
       " 241,\n",
       " 242,\n",
       " 243,\n",
       " 244,\n",
       " 245,\n",
       " 246,\n",
       " 247,\n",
       " 248,\n",
       " 249,\n",
       " 250,\n",
       " 251,\n",
       " 252,\n",
       " 253,\n",
       " 254,\n",
       " 255,\n",
       " 256,\n",
       " 257,\n",
       " 258,\n",
       " 259,\n",
       " 260,\n",
       " 261,\n",
       " 262,\n",
       " 263,\n",
       " 264,\n",
       " 265,\n",
       " 266,\n",
       " 267,\n",
       " 268,\n",
       " 269,\n",
       " 270,\n",
       " 271,\n",
       " 272,\n",
       " 273,\n",
       " 274,\n",
       " 275,\n",
       " 276,\n",
       " 277,\n",
       " 278,\n",
       " 279,\n",
       " 280,\n",
       " 281,\n",
       " 282,\n",
       " 283,\n",
       " 284,\n",
       " 285,\n",
       " 286,\n",
       " 287,\n",
       " 288,\n",
       " 289,\n",
       " 290,\n",
       " 291,\n",
       " 292,\n",
       " 293,\n",
       " 294,\n",
       " 295,\n",
       " 296,\n",
       " 297,\n",
       " 298,\n",
       " 299,\n",
       " 300,\n",
       " 301,\n",
       " 302,\n",
       " 303,\n",
       " 304,\n",
       " 305,\n",
       " 306,\n",
       " 307,\n",
       " 308,\n",
       " 309,\n",
       " 310,\n",
       " 311,\n",
       " 312,\n",
       " 313,\n",
       " 314,\n",
       " 315,\n",
       " 316,\n",
       " 317,\n",
       " 318,\n",
       " 319,\n",
       " 320,\n",
       " 321,\n",
       " 322,\n",
       " 323,\n",
       " 324,\n",
       " 325,\n",
       " 326,\n",
       " 327,\n",
       " 328,\n",
       " 329,\n",
       " 330,\n",
       " 331,\n",
       " 332,\n",
       " 333,\n",
       " 334,\n",
       " 335,\n",
       " 336,\n",
       " 337,\n",
       " 338,\n",
       " 339,\n",
       " 340,\n",
       " 341,\n",
       " 342,\n",
       " 343,\n",
       " 344,\n",
       " 345,\n",
       " 346,\n",
       " 347,\n",
       " 348,\n",
       " 349,\n",
       " 350,\n",
       " 351,\n",
       " 352,\n",
       " 353,\n",
       " 354,\n",
       " 355,\n",
       " 356,\n",
       " 357,\n",
       " 358,\n",
       " 359,\n",
       " 360,\n",
       " 361,\n",
       " 362,\n",
       " 363,\n",
       " 364,\n",
       " 365,\n",
       " 366,\n",
       " 367,\n",
       " 368,\n",
       " 369,\n",
       " 370,\n",
       " 371,\n",
       " 372,\n",
       " 373,\n",
       " 374,\n",
       " 375,\n",
       " 376,\n",
       " 377,\n",
       " 378,\n",
       " 379,\n",
       " 380,\n",
       " 381,\n",
       " 382,\n",
       " 383,\n",
       " 384,\n",
       " 385,\n",
       " 386,\n",
       " 387,\n",
       " 388,\n",
       " 389,\n",
       " 390,\n",
       " 391,\n",
       " 392,\n",
       " 393,\n",
       " 394,\n",
       " 395,\n",
       " 396,\n",
       " 397,\n",
       " 398,\n",
       " 399,\n",
       " 400,\n",
       " 401,\n",
       " 402,\n",
       " 403,\n",
       " 404,\n",
       " 405,\n",
       " 406,\n",
       " 407,\n",
       " 408,\n",
       " 409,\n",
       " 410,\n",
       " 411,\n",
       " 412,\n",
       " 413,\n",
       " 414,\n",
       " 415,\n",
       " 416,\n",
       " 417,\n",
       " 418,\n",
       " 419,\n",
       " 420,\n",
       " 421,\n",
       " 422,\n",
       " 423,\n",
       " 424,\n",
       " 425,\n",
       " 426,\n",
       " 427,\n",
       " 428,\n",
       " 429,\n",
       " 430,\n",
       " 431,\n",
       " 432,\n",
       " 433,\n",
       " 434,\n",
       " 435,\n",
       " 436,\n",
       " 437,\n",
       " 438,\n",
       " 439,\n",
       " 440,\n",
       " 441,\n",
       " 442,\n",
       " 443,\n",
       " 444,\n",
       " 445,\n",
       " 446,\n",
       " 447,\n",
       " 448,\n",
       " 449,\n",
       " 450,\n",
       " 451,\n",
       " 452,\n",
       " 453,\n",
       " 454,\n",
       " 455,\n",
       " 456,\n",
       " 457,\n",
       " 458,\n",
       " 459,\n",
       " 460,\n",
       " 461,\n",
       " 462,\n",
       " 463,\n",
       " 464,\n",
       " 465,\n",
       " 466,\n",
       " 467,\n",
       " 468,\n",
       " 469,\n",
       " 470,\n",
       " 471,\n",
       " 472,\n",
       " 473,\n",
       " 474,\n",
       " 475,\n",
       " 476,\n",
       " 477,\n",
       " 478,\n",
       " 479,\n",
       " 480,\n",
       " 481,\n",
       " 482,\n",
       " 483,\n",
       " 484,\n",
       " 485,\n",
       " 486,\n",
       " 487,\n",
       " 488,\n",
       " 489,\n",
       " 490,\n",
       " 491,\n",
       " 492,\n",
       " 493,\n",
       " 494,\n",
       " 495,\n",
       " 496,\n",
       " 497,\n",
       " 498,\n",
       " 499,\n",
       " 500,\n",
       " 501,\n",
       " 502,\n",
       " 503,\n",
       " 504,\n",
       " 505,\n",
       " 506,\n",
       " 507,\n",
       " 508,\n",
       " 509,\n",
       " 510,\n",
       " 511,\n",
       " 512,\n",
       " 513,\n",
       " 514,\n",
       " 515,\n",
       " 516,\n",
       " 517,\n",
       " 518,\n",
       " 519,\n",
       " 520,\n",
       " 521,\n",
       " 522,\n",
       " 523,\n",
       " 524,\n",
       " 525,\n",
       " 526,\n",
       " 527,\n",
       " 528,\n",
       " 529,\n",
       " 530,\n",
       " 531,\n",
       " 532,\n",
       " 533,\n",
       " 534,\n",
       " 535,\n",
       " 536,\n",
       " 537,\n",
       " 538,\n",
       " 539,\n",
       " 540,\n",
       " 541,\n",
       " 542,\n",
       " 543,\n",
       " 544,\n",
       " 545,\n",
       " 546,\n",
       " 547,\n",
       " 548,\n",
       " 549,\n",
       " 550,\n",
       " 551,\n",
       " 552,\n",
       " 553,\n",
       " 554,\n",
       " 555,\n",
       " 556,\n",
       " 557,\n",
       " 558,\n",
       " 559,\n",
       " 560,\n",
       " 561,\n",
       " 562,\n",
       " 563,\n",
       " 564,\n",
       " 565,\n",
       " 566,\n",
       " 567,\n",
       " 568,\n",
       " 569,\n",
       " 570,\n",
       " 571,\n",
       " 572,\n",
       " 573,\n",
       " 574,\n",
       " 575,\n",
       " 576,\n",
       " 577,\n",
       " 578,\n",
       " 579,\n",
       " 580,\n",
       " 581,\n",
       " 582,\n",
       " 583,\n",
       " 584,\n",
       " 585,\n",
       " 586,\n",
       " 587,\n",
       " 588,\n",
       " 589,\n",
       " 590,\n",
       " 591,\n",
       " 592,\n",
       " 593,\n",
       " 594,\n",
       " 595,\n",
       " 596,\n",
       " 597,\n",
       " 598,\n",
       " 599,\n",
       " 600,\n",
       " 601,\n",
       " 602,\n",
       " 603,\n",
       " 604,\n",
       " 605,\n",
       " 606,\n",
       " 607,\n",
       " 608,\n",
       " 609,\n",
       " 610,\n",
       " 611,\n",
       " 612,\n",
       " 613,\n",
       " 614,\n",
       " 615,\n",
       " 616,\n",
       " 617,\n",
       " 618,\n",
       " 619,\n",
       " 620,\n",
       " 621,\n",
       " 622,\n",
       " 623,\n",
       " 624,\n",
       " 625,\n",
       " 626,\n",
       " 627,\n",
       " 628,\n",
       " 629,\n",
       " 630,\n",
       " 631,\n",
       " 632,\n",
       " 633,\n",
       " 634,\n",
       " 635,\n",
       " 636,\n",
       " 637,\n",
       " 638,\n",
       " 639,\n",
       " 640,\n",
       " 641,\n",
       " 642,\n",
       " 643,\n",
       " 644,\n",
       " 645,\n",
       " 646,\n",
       " 647,\n",
       " 648,\n",
       " 649,\n",
       " 650,\n",
       " 651,\n",
       " 652,\n",
       " 653,\n",
       " 654,\n",
       " 655,\n",
       " 656,\n",
       " 657,\n",
       " 658,\n",
       " 659,\n",
       " 660,\n",
       " 661,\n",
       " 662,\n",
       " 663,\n",
       " 664,\n",
       " 665,\n",
       " 666,\n",
       " 667,\n",
       " 668,\n",
       " 669,\n",
       " 670,\n",
       " 671,\n",
       " 672,\n",
       " 673,\n",
       " 674,\n",
       " 675,\n",
       " 676,\n",
       " 677,\n",
       " 678,\n",
       " 679,\n",
       " 680,\n",
       " 681,\n",
       " 682,\n",
       " 683,\n",
       " 684,\n",
       " 685,\n",
       " 686,\n",
       " 687,\n",
       " 688,\n",
       " 689,\n",
       " 690,\n",
       " 691,\n",
       " 692,\n",
       " 693,\n",
       " 694,\n",
       " 695,\n",
       " 696,\n",
       " 697,\n",
       " 698,\n",
       " 699,\n",
       " 700,\n",
       " 701,\n",
       " 702,\n",
       " 703,\n",
       " 704,\n",
       " 705,\n",
       " 706,\n",
       " 707,\n",
       " 708,\n",
       " 709,\n",
       " 710,\n",
       " 711,\n",
       " 712,\n",
       " 713,\n",
       " 714,\n",
       " 715,\n",
       " 716,\n",
       " 717,\n",
       " 718,\n",
       " 719,\n",
       " 720,\n",
       " 721,\n",
       " 722,\n",
       " 723,\n",
       " 724,\n",
       " 725,\n",
       " 726,\n",
       " 727,\n",
       " 728,\n",
       " 729,\n",
       " 730,\n",
       " 731,\n",
       " 732,\n",
       " 733,\n",
       " 734,\n",
       " 735,\n",
       " 736,\n",
       " 737,\n",
       " 738,\n",
       " 739,\n",
       " 740,\n",
       " 741,\n",
       " 742,\n",
       " 743,\n",
       " 744,\n",
       " 745,\n",
       " 746,\n",
       " 747,\n",
       " 748,\n",
       " 749,\n",
       " 750,\n",
       " 751,\n",
       " 752,\n",
       " 753,\n",
       " 754,\n",
       " 755,\n",
       " 756,\n",
       " 757,\n",
       " 758,\n",
       " 759,\n",
       " 760,\n",
       " 761,\n",
       " 762,\n",
       " 763,\n",
       " 764,\n",
       " 765,\n",
       " 766,\n",
       " 767,\n",
       " 768,\n",
       " 769,\n",
       " 770,\n",
       " 771,\n",
       " 772,\n",
       " 773,\n",
       " 774,\n",
       " 775,\n",
       " 776,\n",
       " 777,\n",
       " 778,\n",
       " 779,\n",
       " 780,\n",
       " 781,\n",
       " 782,\n",
       " 783,\n",
       " 784,\n",
       " 785,\n",
       " 786,\n",
       " 787,\n",
       " 788,\n",
       " 789,\n",
       " 790,\n",
       " 791,\n",
       " 792,\n",
       " 793,\n",
       " 794,\n",
       " 795,\n",
       " 796,\n",
       " 797,\n",
       " 798,\n",
       " 799,\n",
       " 800,\n",
       " 801,\n",
       " 802,\n",
       " 803,\n",
       " 804,\n",
       " 805,\n",
       " 806,\n",
       " 807,\n",
       " 808,\n",
       " 809,\n",
       " 810,\n",
       " 811,\n",
       " 812,\n",
       " 813,\n",
       " 814,\n",
       " 815,\n",
       " 816,\n",
       " 817,\n",
       " 818,\n",
       " 819,\n",
       " 820,\n",
       " 821,\n",
       " 822,\n",
       " 823,\n",
       " 824,\n",
       " 825,\n",
       " 826,\n",
       " 827,\n",
       " 828,\n",
       " 829,\n",
       " 830,\n",
       " 831,\n",
       " 832,\n",
       " 833,\n",
       " 834,\n",
       " 835,\n",
       " 836,\n",
       " 837,\n",
       " 838,\n",
       " 839,\n",
       " 840,\n",
       " 841,\n",
       " 842,\n",
       " 843,\n",
       " 844,\n",
       " 845,\n",
       " 846,\n",
       " 847,\n",
       " 848,\n",
       " 849,\n",
       " 850,\n",
       " 851,\n",
       " 852,\n",
       " 853,\n",
       " 854,\n",
       " 855,\n",
       " 856,\n",
       " 857,\n",
       " 858,\n",
       " 859,\n",
       " 860,\n",
       " 861,\n",
       " 862,\n",
       " 863,\n",
       " 864,\n",
       " 865,\n",
       " 866,\n",
       " 867,\n",
       " 868,\n",
       " 869,\n",
       " 870,\n",
       " 871,\n",
       " 872,\n",
       " 873,\n",
       " 874,\n",
       " 875,\n",
       " 876,\n",
       " 877,\n",
       " 878,\n",
       " 879,\n",
       " 880,\n",
       " 881,\n",
       " 882,\n",
       " 883,\n",
       " 884,\n",
       " 885,\n",
       " 886,\n",
       " 887,\n",
       " 888,\n",
       " 889,\n",
       " 890,\n",
       " 891,\n",
       " 892,\n",
       " 893,\n",
       " 894,\n",
       " 895,\n",
       " 896,\n",
       " 897,\n",
       " 898,\n",
       " 899,\n",
       " 900,\n",
       " 901,\n",
       " 902,\n",
       " 903,\n",
       " 904,\n",
       " 905,\n",
       " 906,\n",
       " 907,\n",
       " 908,\n",
       " 909,\n",
       " 910,\n",
       " 911,\n",
       " 912,\n",
       " 913,\n",
       " 914,\n",
       " 915,\n",
       " 916,\n",
       " 917,\n",
       " 918,\n",
       " 919,\n",
       " 920,\n",
       " 921,\n",
       " 922,\n",
       " 923,\n",
       " 924,\n",
       " 925,\n",
       " 926,\n",
       " 927,\n",
       " 928,\n",
       " 929,\n",
       " 930,\n",
       " 931,\n",
       " 932,\n",
       " 933,\n",
       " 934,\n",
       " 935,\n",
       " 936,\n",
       " 937,\n",
       " 938,\n",
       " 939,\n",
       " 940,\n",
       " 941,\n",
       " 942,\n",
       " 943,\n",
       " 944,\n",
       " 945,\n",
       " 946,\n",
       " 947,\n",
       " 948,\n",
       " 949,\n",
       " 950,\n",
       " 951,\n",
       " 952,\n",
       " 953,\n",
       " 954,\n",
       " 955,\n",
       " 956,\n",
       " 957,\n",
       " 958,\n",
       " 959,\n",
       " 960,\n",
       " 961,\n",
       " 962,\n",
       " 963,\n",
       " 964,\n",
       " 965,\n",
       " 966,\n",
       " 967,\n",
       " 968,\n",
       " 969,\n",
       " 970,\n",
       " 971,\n",
       " 972,\n",
       " 973,\n",
       " 974,\n",
       " 975,\n",
       " 976,\n",
       " 977,\n",
       " 978,\n",
       " 979,\n",
       " 980,\n",
       " 981,\n",
       " 982,\n",
       " 983,\n",
       " 984,\n",
       " 985,\n",
       " 986,\n",
       " 987,\n",
       " 988,\n",
       " 989,\n",
       " 990,\n",
       " 991,\n",
       " 992,\n",
       " 993,\n",
       " 994,\n",
       " 995,\n",
       " 996,\n",
       " 997,\n",
       " 998,\n",
       " 999,\n",
       " 1000,\n",
       " 1001,\n",
       " 1002,\n",
       " 1003,\n",
       " 1004,\n",
       " 1005,\n",
       " 1006,\n",
       " 1007,\n",
       " 1008,\n",
       " 1009,\n",
       " 1010,\n",
       " 1011,\n",
       " 1012,\n",
       " 1013,\n",
       " 1014,\n",
       " 1015,\n",
       " 1016,\n",
       " 1017,\n",
       " 1018,\n",
       " 1019,\n",
       " 1020,\n",
       " 1021,\n",
       " 1022,\n",
       " 1023,\n",
       " 1024,\n",
       " 1025,\n",
       " 1026,\n",
       " 1027,\n",
       " 1028,\n",
       " 1029,\n",
       " 1030,\n",
       " 1031,\n",
       " 1032,\n",
       " 1033,\n",
       " 1034,\n",
       " 1035,\n",
       " 1036,\n",
       " 1037,\n",
       " 1038,\n",
       " 1039,\n",
       " ...]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try_again = list(df[df.duplicated()].index)\n",
    "print(try_again)\n",
    "try_again = try_again + list (set(pages).difference(set(df.index.values)))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = '南方都市报tsv'\n",
    "df_out.to_csv(filename, sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 爬取页面内容，标题、时间、链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40,40,39,40,40,40,40,40,40,40,40,40,40,39,39,40,40,40,40,40,40,40,39,40,40,40,40,40,40,40,40,40,39,40,40,40,40,40,40,40,"
     ]
    }
   ],
   "source": [
    "def get_content(link):\n",
    "    session = HTMLSession()\n",
    "    r = session.get(url=link)\n",
    "    content_xpath_1 = '//*[@id=\"js_content\"]//span/text()'\n",
    "    content_xpath_2 = '//*[@id=\"js_content\"]//p/text()'\n",
    "    content_1 = ''.join(r.html.xpath(content_xpath_1))\n",
    "    content_2 = ''.join(r.html.xpath(content_xpath_2))\n",
    "    return content_1 + content_2\n",
    "\n",
    "def parse_html_snippets(_snippet_):\n",
    "    root = fromstring(_snippet_) \n",
    "    title = [x for x in root.xpath('//div[@class=\"inner_link_article_title\"]//span[2]/text()')]\n",
    "    create_time = [x.text for x in root.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    link = [x for x in root.xpath('//a/@href')]\n",
    "    content_text = [get_content(x) for x in link]\n",
    "    _df_ = pd.DataFrame({\"title\":title, \"create_time\": create_time, \"link\":link, \"content_text\":content_text})\n",
    "    return(_df_)\n",
    "    \n",
    "l_df = []\n",
    "for p in pages[0:40]:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>广州刚刚通报！新增本土11+2！轨迹披露</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今天（1日），广州卫健委官网发布“2021年5月31日广州市新冠肺炎疫情情况”。据通报，20...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>周知！离开佛山最新规定！</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今早（6月1日），广东佛山市新型冠状病毒肺炎疫情防控指挥部发布“关于加强离开佛山人员和车辆管...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>这就是佛山速度！72小时，全城紧急行动！一幕幕让人动容</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>5月27日一则通知↓↓广东佛山通知！南海区、禅城区先后发出公告对南海部分社区和禅城区所有社区...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>广州顶硬上！！！</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>5月30日晚，广州市新型冠状病毒肺炎疫情防控指挥部发布《关于加强离穗车辆和人员管理的通告（第...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>最高能白拿3万美元，还有这种好事？！</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>发钱了？！？！最高能“白拿”3万美元？还有这种“好事”？！？！？发钱资助中国？美国所能想到的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1590</td>\n",
       "      <td>最新！江西一例本土无症状感染者转为确诊病例</td>\n",
       "      <td>2021-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今日（27日）凌晨，江西卫健委披露一例本土无症状感染者转为新冠肺炎境外输入病例的关联确诊病例...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1591</td>\n",
       "      <td>BCI上海代表处发声明了</td>\n",
       "      <td>2021-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>近日，瑞士良好棉花发展协会（BCI）成为舆论焦点。3月26日， BCI上海代表处发表声明，称...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1592</td>\n",
       "      <td>水滴互助宣布关停！</td>\n",
       "      <td>2021-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>继运营了五年的轻松互助关停后，也在上线5周年纪念日前夕关停。昨日（3月26日），水滴互助发布...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1593</td>\n",
       "      <td>一南一北，两官员同天被通报！</td>\n",
       "      <td>2021-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>3月25日，据中央纪委国家监委网站消息，江西省发改委原党组成员、副主任被开除党籍和公职，北京...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1594</td>\n",
       "      <td>中消协点名餐厅仅扫码点餐</td>\n",
       "      <td>2021-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>如今，越来越多的餐厅采取扫码点餐的方式，纸质菜单反而成了“稀缺品”。南都记者调查发现，扫码点...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1595 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            title create_time  \\\n",
       "0            广州刚刚通报！新增本土11+2！轨迹披露  2021-06-01   \n",
       "1                    周知！离开佛山最新规定！  2021-06-01   \n",
       "2     这就是佛山速度！72小时，全城紧急行动！一幕幕让人动容  2021-06-01   \n",
       "3                        广州顶硬上！！！  2021-06-01   \n",
       "4              最高能白拿3万美元，还有这种好事？！  2021-06-01   \n",
       "...                           ...         ...   \n",
       "1590        最新！江西一例本土无症状感染者转为确诊病例  2021-03-27   \n",
       "1591                 BCI上海代表处发声明了  2021-03-27   \n",
       "1592                    水滴互助宣布关停！  2021-03-27   \n",
       "1593               一南一北，两官员同天被通报！  2021-03-27   \n",
       "1594                 中消协点名餐厅仅扫码点餐  2021-03-27   \n",
       "\n",
       "                                                   link  \\\n",
       "0     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "3     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "4     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "...                                                 ...   \n",
       "1590  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1591  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1592  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1593  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1594  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                           content_text  \n",
       "0     今天（1日），广州卫健委官网发布“2021年5月31日广州市新冠肺炎疫情情况”。据通报，20...  \n",
       "1     今早（6月1日），广东佛山市新型冠状病毒肺炎疫情防控指挥部发布“关于加强离开佛山人员和车辆管...  \n",
       "2     5月27日一则通知↓↓广东佛山通知！南海区、禅城区先后发出公告对南海部分社区和禅城区所有社区...  \n",
       "3     5月30日晚，广州市新型冠状病毒肺炎疫情防控指挥部发布《关于加强离穗车辆和人员管理的通告（第...  \n",
       "4     发钱了？！？！最高能“白拿”3万美元？还有这种“好事”？！？！？发钱资助中国？美国所能想到的...  \n",
       "...                                                 ...  \n",
       "1590  今日（27日）凌晨，江西卫健委披露一例本土无症状感染者转为新冠肺炎境外输入病例的关联确诊病例...  \n",
       "1591  近日，瑞士良好棉花发展协会（BCI）成为舆论焦点。3月26日， BCI上海代表处发表声明，称...  \n",
       "1592  继运营了五年的轻松互助关停后，也在上线5周年纪念日前夕关停。昨日（3月26日），水滴互助发布...  \n",
       "1593  3月25日，据中央纪委国家监委网站消息，江西省发改委原党组成员、副主任被开除党籍和公职，北京...  \n",
       "1594  如今，越来越多的餐厅采取扫码点餐的方式，纸质菜单反而成了“稀缺品”。南都记者调查发现，扫码点...  \n",
       "\n",
       "[1595 rows x 4 columns]"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 输出数据，导出保存为excel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pd.ExcelWriter('{公众号}公众号链接及文章内容.xlsx'.format(公众号=公众号),mode='w',engine=\"openpyxl\") as writer:  \n",
    "df_url_out.to_excel(writer, sheet_name=公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
